Load the raw data (from https://www.kaggle.com/ludobenistant/hr-analytics), add factor versions of the categorical and 0/1 columns, and optionally save the result.
# Read the HR data set, declaring the type of every column explicitly rather
# than relying on readr's type guessing.
HR <- read_csv(
  "HR_comma_sep.csv",
  col_types = cols(
    satisfaction_level    = col_double(),
    last_evaluation       = col_double(),
    number_project        = col_integer(),
    average_montly_hours  = col_integer(),
    time_spend_company    = col_integer(),
    Work_accident         = col_integer(),
    left                  = col_integer(),
    promotion_last_5years = col_integer(),
    sales                 = col_character(),
    salary                = col_character()
  )
)

# Salary is an ordered factor (low < medium < high); `sales` is a plain,
# unordered factor.
HR[["salary"]] <- ordered(HR[["salary"]], levels = c("low", "medium", "high"))
HR[["sales"]] <- as.factor(HR[["sales"]])
# HR$location <- factor(sapply(HR$sales, location))

# Add TRUE/FALSE factor companions (trailing underscore) for the 0/1 columns.
# The integer originals are kept, and the new columns are appended after the
# existing ones in this order.
HR[["Work_accident_"]] <- factor(HR[["Work_accident"]] == 1)
HR[["promotion_last_5years_"]] <- factor(HR[["promotion_last_5years"]] == 1)
HR[["left_"]] <- factor(HR[["left"]] == 1)
# write_csv(HR, "HR.csv")

summary(HR)
## satisfaction_level last_evaluation number_project average_montly_hours
## Min. :0.0900 Min. :0.3600 Min. :2.000 Min. : 96.0
## 1st Qu.:0.4400 1st Qu.:0.5600 1st Qu.:3.000 1st Qu.:156.0
## Median :0.6400 Median :0.7200 Median :4.000 Median :200.0
## Mean :0.6128 Mean :0.7161 Mean :3.803 Mean :201.1
## 3rd Qu.:0.8200 3rd Qu.:0.8700 3rd Qu.:5.000 3rd Qu.:245.0
## Max. :1.0000 Max. :1.0000 Max. :7.000 Max. :310.0
##
## time_spend_company Work_accident left
## Min. : 2.000 Min. :0.0000 Min. :0.0000
## 1st Qu.: 3.000 1st Qu.:0.0000 1st Qu.:0.0000
## Median : 3.000 Median :0.0000 Median :0.0000
## Mean : 3.498 Mean :0.1446 Mean :0.2381
## 3rd Qu.: 4.000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :10.000 Max. :1.0000 Max. :1.0000
##
## promotion_last_5years sales salary Work_accident_
## Min. :0.00000 sales :4140 low :7316 FALSE:12830
## 1st Qu.:0.00000 technical :2720 medium:6446 TRUE : 2169
## Median :0.00000 support :2229 high :1237
## Mean :0.02127 IT :1227
## 3rd Qu.:0.00000 product_mng: 902
## Max. :1.00000 marketing : 858
## (Other) :2923
## promotion_last_5years_ left_
## FALSE:14680 FALSE:11428
## TRUE : 319 TRUE : 3571
##
##
##
##
##
# Scatter-plot matrix of a random subsample of HR, grouped by `left`, written
# to hr-scatter.png.
my_colors <- brewer.pal(3, "Set2")

# Cap the sample size at the number of rows: sample() without replacement
# errors when asked for more items than exist.
samp.sz <- min(3000, nrow(HR))
pct <- round(100 * samp.sz / nrow(HR))
# No seed is set, so a different subsample is drawn on every run.
data <- HR[sample(nrow(HR), samp.sz), ]

# Open the PNG device only once the data is ready, and without a preceding
# plot.new(): that call merely drew a blank page on whichever device was
# active before png() took over.
png(filename = "hr-scatter.png", res = 300, width = 3000, height = 3000)

# `finally` guarantees the device is closed (and the file flushed) even if
# plotting fails part-way through.
invisible(tryCatch(
  {
    scatterplotMatrix(
      ~satisfaction_level+last_evaluation+number_project+
        average_montly_hours+time_spend_company+Work_accident+
        promotion_last_5years+sales+salary|left,
      data = data,
      reg.line = "",
      smoother = "",
      col = my_colors,
      smoother.args = list(col = "grey"),
      pch = c(3, 4),
      legend.plot = FALSE,
      main = paste0("Scatter Plot Pairs for HR Turnover (", pct, "% of data)")
    )
    # xpd = TRUE lets the legend be drawn outside the plot region.
    par(xpd = TRUE, cex = 0.7)
    legend(
      x = 0.92, y = 1.16, c("Stay", "Leave"),
      col = my_colors, pch = c(3, 4), horiz = TRUE
    )
  },
  finally = dev.off()
))
Leaving is negatively correlated with:
Leaving is positively correlated with:
In addition:
# Correlation "pie" plots: all employees first, then leavers and stayers
# separately. The three plots share every display setting, so the corrplot
# call lives in one helper.
draw_corr_pies <- function(corr) {
  corrplot(
    corr,
    method = "pie", order = "hclust",
    tl.col = "black", tl.cex = 1, tl.offset = 0.1, tl.srt = 45,
    addrect = 2
  )
}

# Columns 1-8 are the numeric variables, including `left`.
M <- cor(HR[1:8])
draw_corr_pies(M)

# Within each subset `left` is constant, so column 7 (`left`, given the column
# order shown by summary(HR)) is left out.
M <- cor(HR[HR$left_ == "TRUE", c(1:6, 8)])
draw_corr_pies(M)

M <- cor(HR[HR$left_ == "FALSE", c(1:6, 8)])
draw_corr_pies(M)
Many people leaving have been with the company at least 4 years but have not had a promotion in the last 5 years, despite working long hours.
# Monthly hours against (jittered) years at the company, one panel per
# combination of promotion status and leaving status.
coplot(
  average_montly_hours ~ jitter(time_spend_company) |
    promotion_last_5years_ + left_,
  data = HR
)
These unpromoted, long-term workers who leave are receiving the highest evaluations.
# Last evaluation against (jittered) years at the company, one panel per
# combination of promotion status and leaving status.
coplot(
  last_evaluation ~ jitter(time_spend_company) |
    promotion_last_5years_ + left_,
  data = HR
)
These unpromoted, long-term workers who leave are also reporting high levels of satisfaction with the company.
# Satisfaction level against (jittered) years at the company, one panel per
# combination of promotion status and leaving status.
coplot(
  satisfaction_level ~ jitter(time_spend_company) |
    promotion_last_5years_ + left_,
  data = HR
)
Of the unpromoted workers putting in long hours who are leaving, some report the lowest levels of satisfaction, but more report very high levels of satisfaction.
The ones who are working the fewest hours report fairly low satisfaction.
# Monthly hours against satisfaction level, one panel per combination of
# promotion status and leaving status.
coplot(
  average_montly_hours ~ satisfaction_level |
    promotion_last_5years_ + left_,
  data = HR
)
Staff who are on the largest number of projects and put in the longest hours, with no promotion in 5 years, tend to leave.
# Monthly hours against the (jittered) number of projects, one panel per
# combination of promotion status and leaving status.
coplot(
  average_montly_hours ~ jitter(number_project) |
    promotion_last_5years_ + left_,
  data = HR
)
Of the unpromoted workers who left, many had both high evaluations and high levels of satisfaction.
# Last evaluation against satisfaction level, one panel per combination of
# promotion status and leaving status.
coplot(
  last_evaluation ~ satisfaction_level |
    promotion_last_5years_ + left_,
  data = HR
)
Plot: evaluation, satisfaction | hours + left_; evaluation, satisfaction | time with co + left_; evaluation, satisfaction | number_project + left_